# ==============================================================================
# recoding-covariates.R
# author: Anselm Hager / Bernhard Clemm
# ==============================================================================

dir <- dirname(dirname(rstudioapi::getSourceEditorContext()$path))
source(paste0(dir, "/code/setup-packages.R"))

# POPULATION ===================================================================

## Data
population_wide <- read.csv(
  paste0(dir, "/data/covariates/population/population_all.csv"),
  sep = ";"
)

## Cleaning
population_wide <- population_wide %>%
  select(-starts_with("X")) %>%
  mutate(ags = as.character(ags)) %>%
  mutate(ags = ifelse(nchar(ags) == 7, paste0("0", ags), ags)) %>%
  mutate(across(c(
    starts_with("total_"),
    starts_with("men_"),
    starts_with("women_")
  ), ~ case_when(
    # See "Zeichenerklärung" on https://www.regionalstatistik.de
    . == "-" ~ "0", # "nichts vorhanden"
    . == "/" ~ NA_character_, # "keine Angaben, da Zahlenwert nicht sicher genug"
    . == "." ~ NA_character_, # "Zahlenwert unbekannt oder geheimzuhalten"
    TRUE ~ as.character(.)
  ))) %>%
  mutate(across(starts_with("area_"), ~ as.numeric(.)))

## Long format: ags, year, total, men, women
population <- population_wide %>%
  pivot_longer(c(
    starts_with("total_"),
    starts_with("men_"),
    starts_with("women_")
  )) %>%
  separate(name, sep = "_", into = c("type", "year")) %>%
  pivot_wider(names_from = "type", values_from = "value") %>%
  mutate(across(c(total, men, women), ~ as.numeric(.)),
    year = as.integer(year)
  ) %>%
  rename(
    pop_total = total,
    pop_men = men,
    pop_women = women
  )

## Manual corrections
population <- population %>%
  mutate(ags = case_when(
    ags == "02" ~ "02000000",
    ags == "11" ~ "11000000",
    TRUE ~ as.character(ags)
  ))

## Filter to municipalities
population <- population %>%
  filter(nchar(ags) %in% c(5, 8)) %>% # double-check
  mutate(ags = ifelse(nchar(ags) == 5, paste0(ags, "000"), ags))

# LAND USE =====================================================================

## Data
files <- list.files(
  path = paste0(dir, "/data/covariates/landuse"),
  pattern = "*.csv",
  full.names = TRUE, recursive = FALSE
)

landuse_list <- lapply(files, function(x) {
  dt <- read.csv(x, sep = ";")
  dt <- dt %>%
    select(-starts_with("X")) %>%
    mutate(
      ags = as.character(ags),
      year = as.integer(year)
    )
  return(dt)
})

## Cleaning
landuse <- bind_rows(landuse_list) %>%
  mutate(ags = ifelse(nchar(ags) == 7, paste0("0", ags), ags)) %>%
  mutate(across(starts_with("area_"), ~ case_when(
    # See "Zeichenerklärung" on https://www.regionalstatistik.de
    . == "-" ~ "0",
    . == "." ~ NA_character_,
    TRUE ~ as.character(.)
  ))) %>%
  mutate(across(starts_with("area_"), ~ as.numeric(.)))

## Manual corrections
landuse <- landuse %>%
  mutate(ags = case_when(
    ags == "02" ~ "02000000",
    ags == "11" ~ "11000000",
    TRUE ~ as.character(ags)
  ))

## Filter to municipalities
landuse <- landuse %>%
  filter(nchar(ags) %in% c(5, 8)) %>% # double-check
  mutate(ags = ifelse(nchar(ags) == 5, paste0(ags, "000"), ags))

# UNEMPLOYMENT =================================================================

## Data
files <- list.files(
  path = paste0(dir, "/data/covariates/employment"),
  pattern = "*.csv",
  full.names = TRUE, recursive = FALSE
)

employment_list <- lapply(files, function(x) {
  dt <- read.csv(x, sep = ";")
  year <- as.integer(gsub("_", "", str_extract(x, "_20[0-9]{2}"))) # double-check
  dt <- dt %>%
    select(-starts_with("X")) %>%
    mutate(
      ags = as.character(ags),
      year = year
    )
  return(dt)
})

## Cleaning
employment <- bind_rows(employment_list) %>%
  mutate(ags = ifelse(nchar(ags) == 7, paste0("0", ags), ags)) %>%
  mutate(across(starts_with("unempl_"), ~ case_when(
    # See "Zeichenerklärung" on https://www.regionalstatistik.de
    . == "-" ~ "0",
    . == "." ~ NA_character_,
    . == "x" ~ NA_character_, # Tabellenfach gesperrt, weil Aussage nicht sinnvoll
    TRUE ~ as.character(.)
  ))) %>%
  mutate(across(starts_with("area_"), ~ as.numeric(.)))

## Manual corrections
employment <- employment %>%
  mutate(ags = case_when(
    ags == "02" ~ "02000000",
    ags == "11" ~ "11000000",
    TRUE ~ as.character(ags)
  ))

## Filter to municipalities
employment <- employment %>%
  filter(nchar(ags) %in% c(5, 8)) %>% # double-check
  mutate(ags = ifelse(nchar(ags) == 5, paste0(ags, "000"), ags))

# COMBINE ======================================================================

covariates <- population %>%
  left_join(., landuse, by = c("year", "ags")) %>%
  left_join(., employment, by = c("year", "ags"))

# HARMONIZE AGS ================================================================

## Load 2021 AGS (this is the basis for analysis)
ags <- read.csv(
  paste0(dir, "/data/crosswalks/crosswalk.csv")
) %>%
  mutate(ags_2021 = ifelse(nchar(ags) == 7, paste0("0", ags), ags)) %>%
  distinct() %>%
  select(gemeinde, ags_2021)

files <- list.files(
  path = paste0(dir, "/data/crosswalks/crosswalk_historic"),
  pattern = "*.csv",
  full.names = TRUE, recursive = FALSE
)

crosswalk_list <- lapply(files, function(x) {
  dt <- read.csv(x, sep = ";")
  year <- as.integer(gsub("\\.", "", str_extract(x, "20[0-9]{2}\\.")))
  dt <- dt %>%
    select(-starts_with("X")) %>%
    mutate(
      ags_new = as.character(ags_new),
      ags_old = as.character(ags_old),
      type = as.character(type),
      year = year
    ) %>%
    mutate(across(c(ags_old, ags_new), ~ ifelse(nchar(.) == 7, paste0("0", .), .))) %>%
    filter(level == "Gemeinde") %>%
    filter(grepl("3|1", .$type))
  return(dt)
})

names(crosswalk_list) <- 2008:2021

ags_02 <- ags %>%
  left_join(., crosswalk_list[["2021"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2020" = ags_old),
  by = c("ags_2021" = "ags_new")
  ) %>%
  mutate(ags_2020 = ifelse(!is.na(ags_2020), ags_2020, ags_2021)) %>%
  left_join(., crosswalk_list[["2020"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2019" = ags_old),
  by = c("ags_2020" = "ags_new")
  ) %>%
  mutate(ags_2019 = ifelse(!is.na(ags_2019), ags_2019, ags_2020)) %>%
  left_join(., crosswalk_list[["2019"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2018" = ags_old),
  by = c("ags_2019" = "ags_new")
  ) %>%
  mutate(ags_2018 = ifelse(!is.na(ags_2018), ags_2018, ags_2019)) %>%
  left_join(., crosswalk_list[["2018"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2017" = ags_old),
  by = c("ags_2018" = "ags_new")
  ) %>%
  mutate(ags_2017 = ifelse(!is.na(ags_2017), ags_2017, ags_2018)) %>%
  left_join(., crosswalk_list[["2017"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2016" = ags_old),
  by = c("ags_2017" = "ags_new")
  ) %>%
  mutate(ags_2016 = ifelse(!is.na(ags_2016), ags_2016, ags_2017)) %>%
  left_join(., crosswalk_list[["2016"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2015" = ags_old),
  by = c("ags_2016" = "ags_new")
  ) %>%
  mutate(ags_2015 = ifelse(!is.na(ags_2015), ags_2015, ags_2016)) %>%
  left_join(., crosswalk_list[["2015"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2014" = ags_old),
  by = c("ags_2015" = "ags_new")
  ) %>%
  mutate(ags_2014 = ifelse(!is.na(ags_2014), ags_2014, ags_2015)) %>%
  left_join(., crosswalk_list[["2014"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2013" = ags_old),
  by = c("ags_2014" = "ags_new")
  ) %>%
  mutate(ags_2013 = ifelse(!is.na(ags_2013), ags_2013, ags_2014)) %>%
  left_join(., crosswalk_list[["2013"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2012" = ags_old),
  by = c("ags_2013" = "ags_new")
  ) %>%
  mutate(ags_2012 = ifelse(!is.na(ags_2012), ags_2012, ags_2013)) %>%
  left_join(., crosswalk_list[["2012"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2011" = ags_old),
  by = c("ags_2012" = "ags_new")
  ) %>%
  mutate(ags_2011 = ifelse(!is.na(ags_2011), ags_2011, ags_2012)) %>%
  left_join(., crosswalk_list[["2011"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2010" = ags_old),
  by = c("ags_2011" = "ags_new")
  ) %>%
  mutate(ags_2010 = ifelse(!is.na(ags_2010), ags_2010, ags_2011)) %>%
  left_join(., crosswalk_list[["2010"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2009" = ags_old),
  by = c("ags_2010" = "ags_new")
  ) %>%
  mutate(ags_2009 = ifelse(!is.na(ags_2009), ags_2009, ags_2010)) %>%
  left_join(., crosswalk_list[["2009"]] %>%
    select(ags_old, ags_new) %>%
    rename("ags_2008" = ags_old),
  by = c("ags_2009" = "ags_new")
  ) %>%
  mutate(ags_2008 = ifelse(!is.na(ags_2008), ags_2008, ags_2009)) %>%
  distinct()

ags_03 <- ags_02 %>%
  pivot_longer(ags_2020:ags_2008, values_to = "ags") %>%
  separate(name, sep = "_", into = c("name", "year")) %>%
  select(-name) %>%
  distinct() %>%
  mutate(year = as.integer(year))

## Manual corrections of ags
ags_04 <- ags_03 %>%
  mutate(ags = ifelse(
    # these cases supposedly changed AGS, but have data for the 2021 AGS across all years
    ags_2021 %in% c(
      "05334002", "07132074", "07132084", "07140502", "07140503", "07140504",
      "07235207", "09472111", "09472116", "09472127", "09472140", "09472143",
      "09472146", "09472155", "09472167", "09472197", "09472198", "09671135",
      "09671159", "15081030", "15083270", "15085330", "15086040", "15090435",
      "09377127", "09377133", "07232502", "15081135", "15081240", "15081455",
      "15085040", "15085235", "15091145", "07133006", "08415971", "09377113"
    ),
    ags_2021, ags
  )) %>%
  # these cases have the same AGS for some, but not all earlier years
  mutate(ags = case_when(
    ags_2021 == "15091391" & year %in% 2010:2013 ~ ags_2021,
    ags_2021 == "15088216" & year %in% 2010 ~ ags_2021,
    ags_2021 == "15091241" & year %in% 2010 ~ ags_2021,
    TRUE ~ as.character(ags)
  )) %>%
  unique()

covariates_02 <- ags_04 %>%
  left_join(., covariates %>% select(-gemeinde), by = c("year", "ags")) %>%
  mutate(across(starts_with("unempl"), ~ as.numeric(.)))

covariates_03 <- covariates_02 %>%
  group_by(ags_2021, gemeinde, year) %>%
  summarize(
    pop_total = ifelse(all(is.na(pop_total)), NA, sum(pop_total, na.rm = T)),
    pop_men = ifelse(all(is.na(pop_men)), NA, sum(pop_men, na.rm = T)),
    pop_women = ifelse(all(is.na(pop_women)), NA, sum(pop_women, na.rm = T)),
    area_total = ifelse(all(is.na(area_total)), NA, sum(area_total, na.rm = T)),
    area_settle = ifelse(all(is.na(area_settle)), NA, sum(area_settle, na.rm = T)),
    area_traffic = ifelse(all(is.na(area_traffic)), NA, sum(area_traffic, na.rm = T)),
    area_agri = ifelse(all(is.na(area_agri)), NA, sum(area_agri, na.rm = T)),
    area_heathen = ifelse(all(is.na(area_heathen)), NA, sum(area_heathen, na.rm = T)),
    area_forest = ifelse(all(is.na(area_forest)), NA, sum(area_forest, na.rm = T)),
    unempl_total = ifelse(all(is.na(unempl_total)), NA, sum(unempl_total, na.rm = T)),
    unempl_foreigners = ifelse(all(is.na(unempl_foreigners)), NA, sum(unempl_foreigners, na.rm = T)),
    unempl_handicapped = ifelse(all(is.na(unempl_handicapped)), NA, sum(unempl_handicapped, na.rm = T)),
    unempl_15.20y = ifelse(all(is.na(unempl_15.20y)), NA, sum(unempl_15.20y, na.rm = T)),
    unempl_15.25y = ifelse(all(is.na(unempl_15.25y)), NA, sum(unempl_15.25y, na.rm = T)),
    unempl_55.65y = ifelse(all(is.na(unempl_55.65y)), NA, sum(unempl_55.65y, na.rm = T)),
    unempl_15.25y = ifelse(all(is.na(unempl_15.25y)), NA, sum(unempl_15.25y, na.rm = T))
  )


# EXPORT ================================================================

write.csv(covariates_03, paste0(dir, "/data/covariates/covariates_empl_land.csv"),
  row.names = F
)
